In [1]:
# !unzip NYPD_Complaint_Data_Histroic_2020_2022.csv.zip
Archive:  NYPD_Complaint_Data_Histroic_2020_2022.csv.zip
  inflating: NYPD_Complaint_Data_Histroic_2020_2022.csv  
In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

Data understanding - ทำความเข้าใจว่าข้อมูลที่ได้มาเป็นอะไร¶

In [4]:
df = pd.read_csv("NYPD_Complaint_Data_Histroic_2020_2022.csv")
df
Out[4]:
CMPLNT_NUM CMPLNT_FR_DT CMPLNT_FR_TM CMPLNT_TO_DT CMPLNT_TO_TM ADDR_PCT_CD RPT_DT KY_CD OFNS_DESC PD_CD ... SUSP_SEX TRANSIT_DISTRICT Latitude Longitude Lat_Lon PATROL_BORO STATION_NAME VIC_AGE_GROUP VIC_RACE VIC_SEX
0 210240572 02/28/2020 15:15:00 NaN (null) 106.0 02/28/2020 344 ASSAULT 3 & RELATED OFFENSES 101.0 ... M NaN 40.682398 -73.840072 (40.682398279333, -73.8400721577516) PATROL BORO QUEENS SOUTH (null) <18 BLACK M
1 210881300 03/10/2020 21:30:00 NaN (null) 67.0 03/11/2020 578 HARRASSMENT 2 638.0 ... M NaN 40.648851 -73.951017 (40.6488507469884, -73.951016510623) PATROL BORO BKLYN SOUTH (null) 45-64 BLACK M
2 213858183 05/28/2020 08:15:00 NaN (null) 43.0 05/28/2020 107 BURGLARY 231.0 ... (null) NaN 40.822912 -73.870041 (40.8229123084767, -73.8700413043181) PATROL BORO BRONX (null) UNKNOWN UNKNOWN D
3 213831383 05/31/2020 13:45:00 NaN (null) 47.0 05/31/2020 578 HARRASSMENT 2 638.0 ... U NaN 40.887314 -73.847272 (40.8873136344706, -73.8472717577564) PATROL BORO BRONX (null) 25-44 BLACK M
4 213756171 06/01/2020 16:00:00 NaN (null) 52.0 06/01/2020 578 HARRASSMENT 2 638.0 ... (null) NaN 40.869470 -73.879861 (40.8694704770483, -73.8798608037303) PATROL BORO BRONX (null) 25-44 BLACK F
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
1395506 261171983 12/30/2022 12:31:00 NaN (null) 106.0 12/31/2022 341 PETIT LARCENY 349.0 ... U NaN 40.675259 -73.854501 (40.67525903, -73.85450113) PATROL BORO QUEENS SOUTH (null) 25-44 ASIAN / PACIFIC ISLANDER F
1395507 261175492 12/31/2022 11:56:00 12/31/2022 17:08:00 44.0 12/31/2022 359 OFFENSES AGAINST PUBLIC ADMINI 759.0 ... F NaN 40.829632 -73.915209 (40.829632, -73.915209) PATROL BORO BRONX (null) 65+ BLACK HISPANIC M
1395508 261147482 12/30/2022 17:20:00 12/30/2022 17:28:00 17.0 12/30/2022 344 ASSAULT 3 & RELATED OFFENSES 101.0 ... M NaN 40.761265 -73.968887 (40.761265, -73.968887) PATROL BORO MAN SOUTH (null) 25-44 BLACK M
1395509 261179651 12/31/2022 20:20:00 12/31/2022 20:26:00 112.0 12/31/2022 348 VEHICLE AND TRAFFIC LAWS 916.0 ... (null) NaN 40.725681 -73.857270 (40.72568117, -73.8572701) PATROL BORO QUEENS NORTH (null) 18-24 WHITE F
1395510 261157928 12/31/2022 04:00:00 NaN (null) 52.0 12/31/2022 106 FELONY ASSAULT 109.0 ... M NaN 40.870942 -73.891499 (40.870942, -73.891499) PATROL BORO BRONX (null) 25-44 BLACK M

1395511 rows × 35 columns

Data cleaning & preprocessing¶

In [64]:
NYPD_df = df[['CMPLNT_FR_TM','RPT_DT','OFNS_DESC','LAW_CAT_CD','VIC_AGE_GROUP','VIC_RACE','VIC_SEX']]
NYPD_df
Out[64]:
CMPLNT_FR_TM RPT_DT OFNS_DESC LAW_CAT_CD VIC_AGE_GROUP VIC_RACE VIC_SEX
0 15:15:00 02/28/2020 ASSAULT 3 & RELATED OFFENSES MISDEMEANOR <18 BLACK M
1 21:30:00 03/11/2020 HARRASSMENT 2 VIOLATION 45-64 BLACK M
2 08:15:00 05/28/2020 BURGLARY FELONY UNKNOWN UNKNOWN D
3 13:45:00 05/31/2020 HARRASSMENT 2 VIOLATION 25-44 BLACK M
4 16:00:00 06/01/2020 HARRASSMENT 2 VIOLATION 25-44 BLACK F
... ... ... ... ... ... ... ...
1395506 12:31:00 12/31/2022 PETIT LARCENY MISDEMEANOR 25-44 ASIAN / PACIFIC ISLANDER F
1395507 11:56:00 12/31/2022 OFFENSES AGAINST PUBLIC ADMINI MISDEMEANOR 65+ BLACK HISPANIC M
1395508 17:20:00 12/30/2022 ASSAULT 3 & RELATED OFFENSES MISDEMEANOR 25-44 BLACK M
1395509 20:20:00 12/31/2022 VEHICLE AND TRAFFIC LAWS MISDEMEANOR 18-24 WHITE F
1395510 04:00:00 12/31/2022 FELONY ASSAULT FELONY 25-44 BLACK M

1395511 rows × 7 columns

In [65]:
crime_mapping = {
    'HARRASSMENT 2': 'HARASSMENT',
    'ESCAPE 3': 'ESCAPE',
    'ASSAULT 3 & RELATED OFFENSES': 'ASSAULT & RELATED OFFENSES',
    'CRIMINAL MISCHIEF & RELATED OF': 'CRIMINAL MISCHIEF',
    'OFF. AGNST PUB ORD SENSBLTY &': 'OFFENSES AGAINST PUBLIC ORDER/ADMINISTRATION',
    'OTHER STATE LAWS (NON PENAL LA': 'OTHER STATE LAWS (NON PENAL LAW)',
    'ENDAN WELFARE INCOMP': 'ENDANGERING WELFARE OF INCOMPETENT',
    'AGRICULTURE & MRKTS LAW-UNCLASSIFIED': 'AGRICULTURE & MARKETS LAW',
    'DISRUPTION OF A RELIGIOUS SERV': 'DISRUPTION OF A RELIGIOUS SERVICE',
    'LOITERING/GAMBLING (CARDS, DIC': 'GAMBLING',
    'OFFENSES AGAINST MARRIAGE UNCL': 'OFFENSES AGAINST MARRIAGE',
    'HOMICIDE-NEGLIGENT,UNCLASSIFIE': 'HOMICIDE-NEGLIGENT',
    'E': 'UNKNOWN',
    'D': 'BUSINESS/ORGANIZATION',
    'F': 'FEMALE',
    'M': 'MALE'
}
In [66]:
def get_map(x):
    if x not in crime_mapping : return x
    return crime_mapping.get(x, x)

NYPD_df["OFNS_DESC"] = NYPD_df["OFNS_DESC"].map(get_map)
print(NYPD_df["OFNS_DESC"].value_counts())
PETIT LARCENY                        284870
HARASSMENT                           224636
ASSAULT & RELATED OFFENSES           146673
CRIMINAL MISCHIEF                    136687
GRAND LARCENY                        128647
                                      ...  
NEW YORK CITY HEALTH CODE                13
INTOXICATED/IMPAIRED DRIVING              8
DISRUPTION OF A RELIGIOUS SERVICE         4
FORTUNE TELLING                           1
OFFENSES AGAINST MARRIAGE                 1
Name: OFNS_DESC, Length: 63, dtype: int64
<ipython-input-66-1808c16af01e>:5: SettingWithCopyWarning:


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

In [67]:
sex_mapping = {
    'F': 'FEMALE',
    'M': 'MALE'
}

def sex_map(x):
    if x not in sex_mapping : return "UNKNOWN"
    return sex_mapping.get(x, x)

NYPD_df["VIC_SEX"] = NYPD_df["VIC_SEX"].map(sex_map)
print(NYPD_df["VIC_SEX"].value_counts())
FEMALE     550097
MALE       495695
UNKNOWN    349719
Name: VIC_SEX, dtype: int64
<ipython-input-67-f1aca97aca78>:10: SettingWithCopyWarning:


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

In [68]:
NYPD_df
Out[68]:
CMPLNT_FR_TM RPT_DT OFNS_DESC LAW_CAT_CD VIC_AGE_GROUP VIC_RACE VIC_SEX
0 15:15:00 02/28/2020 ASSAULT & RELATED OFFENSES MISDEMEANOR <18 BLACK MALE
1 21:30:00 03/11/2020 HARASSMENT VIOLATION 45-64 BLACK MALE
2 08:15:00 05/28/2020 BURGLARY FELONY UNKNOWN UNKNOWN UNKNOWN
3 13:45:00 05/31/2020 HARASSMENT VIOLATION 25-44 BLACK MALE
4 16:00:00 06/01/2020 HARASSMENT VIOLATION 25-44 BLACK FEMALE
... ... ... ... ... ... ... ...
1395506 12:31:00 12/31/2022 PETIT LARCENY MISDEMEANOR 25-44 ASIAN / PACIFIC ISLANDER FEMALE
1395507 11:56:00 12/31/2022 OFFENSES AGAINST PUBLIC ADMINI MISDEMEANOR 65+ BLACK HISPANIC MALE
1395508 17:20:00 12/30/2022 ASSAULT & RELATED OFFENSES MISDEMEANOR 25-44 BLACK MALE
1395509 20:20:00 12/31/2022 VEHICLE AND TRAFFIC LAWS MISDEMEANOR 18-24 WHITE FEMALE
1395510 04:00:00 12/31/2022 FELONY ASSAULT FELONY 25-44 BLACK MALE

1395511 rows × 7 columns

In [69]:
NYPD_df.isna().sum()
Out[69]:
CMPLNT_FR_TM     0
RPT_DT           0
OFNS_DESC        0
LAW_CAT_CD       0
VIC_AGE_GROUP    0
VIC_RACE         0
VIC_SEX          0
dtype: int64
In [70]:
NYPD_df['OFNS_DESC'].value_counts()
Out[70]:
PETIT LARCENY                        284870
HARASSMENT                           224636
ASSAULT & RELATED OFFENSES           146673
CRIMINAL MISCHIEF                    136687
GRAND LARCENY                        128647
                                      ...  
NEW YORK CITY HEALTH CODE                13
INTOXICATED/IMPAIRED DRIVING              8
DISRUPTION OF A RELIGIOUS SERVICE         4
FORTUNE TELLING                           1
OFFENSES AGAINST MARRIAGE                 1
Name: OFNS_DESC, Length: 63, dtype: int64

Exploratory Analysis - ทำการ plot graphs เพื่อใช้เป็นหลักฐานสนับสนุนการตัดสินใจทางธุรกิจ¶

- Type of Crimes
- Analyzing a Specific Crime
In [71]:
import plotly.express as px

fig = px.histogram(NYPD_df, x='OFNS_DESC', color='VIC_SEX')
fig.show()

Define a research question - ตั้งคำถามจากข้อมูลที่ได้ โดยที่คำตอบของคำถามสามารถนำไปตัดสินใจได้¶

แนวโน้มอาชกรรม Harrassment ในเพศหญิงในแต่ละปีมีการเพิ่มขึ้นหรือลดลง? เพื่อนำไปใช้ในการช่วยสร้างมาตรการป้องกันในอนาคต เช่นการรณนรงค์ของกระทรวงอาชญกากกรม

Data Analytics¶

- Statistical analysis - ทำการวิเคราะห์ข้อมูลทางสถิติเพื่อใช้เป็นหลักฐานสนับสนุนการตัดสินใจทางธุรกิจ
- Data Aggregation - ทำการจับกลุ่มเพื่อสรุปหา insights ที่น่าสนใจ
- Time Series Analysis - ทำการดู trend สิ่งที่สนใจว่ามีการเปลี่ยนแปลงไปอย่างไร หรือมีแนวโน้มอย่างไรในอนาคต

Statistical analysis¶

In [84]:
har_df = NYPD_df[['OFNS_DESC', 'RPT_DT', 'VIC_SEX']][NYPD_df['OFNS_DESC']=='HARASSMENT']
har_df['VIC_SEX'].value_counts()
Out[84]:
FEMALE     142271
MALE        80200
UNKNOWN      2165
Name: VIC_SEX, dtype: int64
In [110]:
har_df['OFNS_DESC'].value_counts()
Out[110]:
HARASSMENT    224636
Name: OFNS_DESC, dtype: int64
In [111]:
har_df['VIC_SEX'].value_counts()
Out[111]:
FEMALE     142271
MALE        80200
UNKNOWN      2165
Name: VIC_SEX, dtype: int64
In [112]:
har_df.groupby(by=['VIC_SEX','OFNS_DESC'])['VIC_SEX'].count()
Out[112]:
VIC_SEX  OFNS_DESC 
FEMALE   HARASSMENT    142271
MALE     HARASSMENT     80200
UNKNOWN  HARASSMENT      2165
Name: VIC_SEX, dtype: int64

Time Series Analysis¶

In [87]:
import numpy as np
import pandas as pd
from datetime import datetime, timedelta
import matplotlib.pyplot as plt
In [97]:
har_df['RPT_DT'] = pd.to_datetime(har_df['RPT_DT'])

har_df['RPT_DT'] = [x.strftime("%Y") for x in har_df['RPT_DT']]

har_df
Out[97]:
OFNS_DESC RPT_DT VIC_SEX
1 HARASSMENT 2020 MALE
3 HARASSMENT 2020 MALE
4 HARASSMENT 2020 FEMALE
7 HARASSMENT 2020 FEMALE
10 HARASSMENT 2020 FEMALE
... ... ... ...
1395491 HARASSMENT 2022 MALE
1395493 HARASSMENT 2022 FEMALE
1395498 HARASSMENT 2022 FEMALE
1395504 HARASSMENT 2022 FEMALE
1395505 HARASSMENT 2022 FEMALE

224636 rows × 3 columns

Data Aggregation¶

In [105]:
# print(har_df.pivot_table(index=["RPT_DT"], columns="VIC_SEX",
#                        values="OFNS_DESC", aggfunc=["count"], margins=True))

har_pivot = har_df.pivot_table(index=["RPT_DT"], columns="VIC_SEX",
                       values="OFNS_DESC", aggfunc=["count"])
har_pivot
Out[105]:
count
VIC_SEX FEMALE MALE UNKNOWN
RPT_DT
2020 42457 23750 592
2021 47384 26726 711
2022 52430 29724 862
In [108]:
har_pivot.unstack()
Out[108]:
       VIC_SEX  RPT_DT
count  FEMALE   2020      42457
                2021      47384
                2022      52430
       MALE     2020      23750
                2021      26726
                2022      29724
       UNKNOWN  2020        592
                2021        711
                2022        862
dtype: int64

Storytelling - สรุป insights ที่เราพบให้ดูเข้าใจง่าย¶

ดูเหมือนว่าการก่ออาชญากรรมในเมืองนิวยอกนั้นมีประเภทการกระทำความผิดหลากหลายรูปเเบบซึ่งในส่วนของการล่วงละเมิดทางเพศนั้นมีจำนวนมากเป็นอันดับต้นๆในการกระทำความผิด ซึ่งระยะเวลาไม่ได้ช่วยให้การจำนวนการล่วงละเมิดทางเพศนั้นลดลง อีกทั้งเพศหญิงยังคงมีสถิติการถูกล่วงละเมิดมากกว่าเพศชายอีกด้วย